################################################################################
######## CROSS VALIDATION OF DYNAMIC GROUP-LEVEL ITEM-RESPONSE MODEL ###########
################################################################################

rm(list=ls())
gc()
options(width=110)

### LIBRARIES
library(rstan)
library(foreign)
library(pbapply)
library(parallel)
library(TeachingDemos)
library(ggplot2)
library(reshape2)
library(car)
library(plyr)
library(dplyr)
library(survey)
library(stringr)
library(data.table)
library(matlab)
detectCores()

### FUNCTIONS
## size of objects in memory
.ls.objects <- function (pos = 1, pattern, order.by,
                         decreasing=FALSE, head=FALSE, n=5) {
    napply <- function(names, fn) sapply(names, function(x)
                                         fn(get(x, pos = pos)))
    names <- ls(pos = pos, pattern = pattern)
    obj.class <- napply(names, function(x) as.character(class(x))[1])
    obj.mode <- napply(names, mode)
    obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
    obj.prettysize <- napply(names, function(x) {
        capture.output(print(object.size(x), units = "auto"))
    })
    obj.size <- napply(names, object.size)
    obj.dim <- t(napply(names, function(x)
                        as.numeric(dim(x))[1:2]))
    vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
    obj.dim[vec, 1] <- napply(names, length)[vec]
    out <- data.frame(obj.type, obj.size, obj.prettysize, obj.dim)
    names(out) <- c("Type", "Size", "PrettySize", "Rows", "Columns")
    if (!missing(order.by))
        out <- out[order(out[[order.by]], decreasing=decreasing), ]
    if (head)
        out <- head(out, n)
    out
}
## shorthand
lsos <- function(..., n=10) {
    .ls.objects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}
OrdLev <- function (x) {
    x <- factor(x)
    if (nlevels(x) == 2 && grepl("non-", levels(x)[2], TRUE)) {
        x <- factor(x, levels=rev(levels(x)))
    }
    return(x)
}
LoopProgress <- function(index, interval=1) {
    if (index %% interval == 0) print (index)
}
calc.pars.normal <- function (x, p) {
    ## Calculates the parameters of a normal dist. given two values of the
    ## CDF (p[1] and p[2]) at given values (x[1] and p[2]), where x[1] < x[2].
    if (x[1] > x[2]) {
        x[1:2] <- x[2:1]
        p[1:2] <- p[2:1]
    }
    mean <- (x[1]*qnorm(p[2]) - x[2]*qnorm(p[1])) / (qnorm(p[2]) - qnorm(p[1]))
    sd <- (x[2] - x[1]) / (qnorm(p[2]) - qnorm(p[1]))
    out <- list(mean=mean, sd=sd)
    return(out)
}
FileName <- function (path=NULL, name=NULL, ext=NULL, replace=FALSE) {
    p <- paste(path, collapse = "")
    n <- paste(name, collapse = "")
    e <- paste(".", ext, sep = "")
    d <- format(Sys.Date(), "%y%m%d") 
    for (l in seq_along(letters)) {
        if (l > 1) old_file_name <- file_name
        file_name <- paste0(p, d, paste(n, letters[l], sep="-"), e)
        if (!any(grepl(file_name, list.files()))) {
            if (replace) file_name <- ifelse(l > 1, old_file_name, file_name)
            break
        }
    }
    cat("\nFile name:", file_name, "\n\n")
    return(file_name)
}
allNA <- function (x) all(is.na(x))
anyNA <- function (x) any(is.na(x))
allValid <- function (x) all(!is.na(x))
anyValid <- function (x) any(!is.na(x))
sumValid <- function (x) sum(!is.na(x))
whichValid <- function (x) which(!is.na(x))
POtoSouth11 <- function (pos, dta=st.info) {
    dta$South11[match(pos, dta$POAbrv)]
}
POtoSouth13 <- function (pos, dta=st.info) {
    dta$South13[match(pos, dta$POAbrv)]
}
NameToPO <- function (stnames, dta=st.info) {
    as.character(dta$POAbrv[match(tolower(stnames), tolower(dta$Name))])
}
CodeToPO <- function (stcodes, dta=st.info) {
    as.character(dta$POAbrv[match(stcodes, dta$ICPSRCode)])
}
POtoCode <- function (pos, dta=st.info) {
    return(as.integer(dta$ICPSRCode[match(pos, dta$POAbrv)]))
}
FIPStoPO <- function (fips, dta=st.info) {
    as.character(dta$POAbrv[match(fips, dta$fips/1000)])
}
CodeToName <- function (stcodes, dta=st.info) {
    return(as.character(dta$Name[match(stcodes, dta$ICPSRCode)]))
}
POtoRegion4 <- function (pos, dta=st.info) {
    as.character(dta$REGION4[match(pos, dta$POAbrv)])
}
Interpolate <- function (last.yr, next.yr, this.year, last.value, next.value) {
    w.l <- (next.yr - this.yr) / (next.yr - last.yr)
    w.n <- (this.yr - last.yr) / (next.yr - last.yr)
    return(last.value * w.l + next.value * w.n)
}
jagsFile <- function (jags.code, file.name, dir=".") {
    fileConn <- file(file.name)
    setwd(dir)
    writeLines(jags.code, fileConn)
    close(fileConn)
}
mypdf <- function(name, ...) {
    date <- format(Sys.Date(), "%y%m%d")
    pdf(paste(paste(name, collapse=""), date, ".pdf", sep=""), ...)
}
mysw <- function (...) {
    setwd(paste(..., sep = "/"))
}
NAtoF <- function (x) {
    y <- x
    y[is.na(y)] <- FALSE
    return(y)
}
NAto0 <- function (x) {
    y <- x
    y[is.na(y)] <- 0
    return(y)
}
NonNA <- function (x) {
    return(x[!is.na(x)])
}
PasteEval <- function(..., print = FALSE) {
    txt <- paste(..., sep = "")
    if (print) { cat("\n", txt, "\n") }
    eval(parse(text = txt), envir = parent.frame())
}
ReadLatest <- function (name, dir=".", format="dta", ...) {
    all.files <- list.files(dir)
    files <- all.files[grepl(name, all.files) & grepl(format, all.files)]
    dates <- sub(".*([0-9]{6}).*", "\\1", files)
    if (format == "dta") {
        data <- read.dta(paste(dir, files[which.max(dates)], sep="/"), ...)
    }
    if (format == "csv") {
        data <- read.csv(paste(dir, files[which.max(dates)], sep="/"), ...)
    }
    cat("\nOpening:", files[which.max(dates)], "\n")
    return (data)
}
SourceLatest <- function (name, dir=".", ...) {
    all.files <- list.files(dir)
    files <- all.files[grepl(name, all.files) & grepl("\\.R\\>", all.files)]
    dates <- sub(".*([0-9]{6}).*", "\\1", files)
    file <- paste(dir, files[which.max(dates)], sep="/")
    cat("\nOpening:", files[which.max(dates)], "\n")
    source(file, ...)
}
Formula <- function (vars, y=NULL, text=FALSE) {
    Y <- paste(y, "~")
    vars <- paste(vars, collapse = " + ")
    if (text) return(paste(Y, vars))
    else return(as.formula(paste(Y, vars)))
}
onDevin <- function (...) {
    user <- Sys.info()["user"]
    onD <- grepl("devin", user, ignore.case=TRUE) 
    return(onD)
}
onChris <- function (...) {
    user <- Sys.info()["user"]
    onD <- grepl("cwarshaw", user, ignore.case=TRUE) 
    return(onD)
}
onHMDC <- function (...) {
    user <- Sys.info()["nodename"]
    hmdc <- grepl("hmdc", user, ignore.case=TRUE) 
    return(hmdc)
}
onRSTUDIO <- function (...) {
    user <- Sys.info()["user"]
    rstudio <- grepl("rstudio", user, ignore.case=TRUE) 
    return(rstudio)
}
grepSub <- function (obj, pattern, drop=FALSE, ...) {
    sub <- grepl(pattern, x=obj, ...)
    if (drop) sub <- !sub
    return (obj[sub])
}
unform <- function (f) {
    x <- paste(as.character(f), collapse=" ")
    x <- strsplit(x, " \\+ ")
    x <- unlist(x)
    x[1] <- gsub("[~ ]", "", x[1])
    x
}
RakePartial <- PasteEval(sub("compress = FALSE",
                             "compress = FALSE, partial=TRUE", deparse(rake)))

poll.set <- "modernCV"

## Start syncing output to file
txt.out <- FileName(poll.set,, "out")
txtStart(file=txt.out)
print(poll.set)
getwd()

## Load state-level data
st.info <- read.csv("StateCodes.csv")
lower48 <- setdiff(as.character(st.info$POAbrv), c("AK", "HI", "DC"))

constant.item <- as.integer(TRUE) ## difficulty parameters constant over time?
change.at.election <- as.integer(FALSE) ## allow model to change at elections?

min.years <- 1
min.polls <- 1
poll.df <- read.dta("all_survey_keep140428.dta")
print(names(poll.df))
q.info <- read.csv("items_dimension140301.csv", stringsAsFactors=FALSE)
q.info
q.vars <- names(poll.df)[grep("cces2006_minimumwage", names(poll.df)):
                         ncol(poll.df)]
q.subset <- "AllLib"
if (q.subset == "GovIsLib") {
    q.vars <- q.info$item[q.info$lib_is_gov == 1]
    poll.df <- poll.df[, !names(poll.df) %in%
                       q.info$item[q.info$lib_is_gov == 0]]
}
if (q.subset == "AllLib") {
    q.vars <- q.info$item
}
q.vars <- sort(setdiff(q.vars, "schoolprayer_amendment"))
min.yr <- 1976
max.yr <- 2010

drop.cces <- TRUE
if (drop.cces) {
    poll.df <- subset(poll.df, organization != "CCES")
}
## State PO abbreviation
table(poll.df$StPOAbrv <- factor(poll.df$abb, exclude=c("", "NA")))
stopifnot(all(levels(poll.df$StPOAbrv) %in% levels(st.info$POAbrv)))
poll.df <- subset(poll.df, StPOAbrv %in% levels(st.info$POAbrv))
## Education
edu5.labels <- c("No High School Degree", "High School Grad",
                 "Some College", "College Graduate", "Graduate School")
edu4.labels <- c("No High School Degree", "High School Grad",
                 "Some College", "College Graduate")
poll.df <- mutate(poll.df,
                  Edu5=factor(education, labels=edu5.labels),
                  Edu5=ordered(Edu5, levels=edu5.labels),
                  Edu4=Recode(Edu5, 'c("College Graduate",
                        "Graduate School")= "College Graduate"'),
                  Edu4=ordered(Edu4, levels=edu4.labels))
## Female
table(poll.df$Female <- factor(poll.df$gender, labels=c("Male", "Female")))
## Race
nlevels.race <- 3
if (nlevels.race == 3) {
    race.labels <- c("White or Hispanic", "Black", "Other")
    if (grepl("modern", poll.set)) {
        poll.df$Race3 <- factor(poll.df$race_new, labels=race.labels)
    }
    poll.df$Race2 <- "Non-Black"
    poll.df$Race2 <- ifelse(poll.df$Race3 == "Black", "Black",
                            poll.df$Race2)
    is.na(poll.df$Race2) <- is.na(poll.df$Race3)
    poll.df$Race2 <- factor(poll.df$Race2)
}
if (nlevels.race == 4) {
    race.labels <- c("White", "Black", "Hispanic", "Other")
    poll.df$Race4 <- factor(poll.df$race, labels=race.labels)
}
## PID
poll.df$PID3 <- ordered(poll.df$pid3, labels=c("D", "I", "R"))
## Year
poll.df <- mutate(poll.df, Year=as.integer(year),
                  ElecYear=ifelse(month < 11, Year, Year + 1),
                  ElecYear=ifelse(month == 11 & !grepl("NES", source),
                      NA, ElecYear))
## weight variable
if (!"weight" %in% names(poll.df)) poll.df$weight <- 1
poll.df <- mutate(poll.df, weight=ifelse(weight > 0, weight, 1))
(fm.vars <- setdiff(names(poll.df), q.vars))
summary(poll.df)
min.yr <- max(c(min.yr, min(poll.df$Year, na.rm=TRUE)))
max.yr <- min(c(max.yr, max(poll.df$Year, na.rm=TRUE)))

## Drop years outside year range
(yr.range <- min.yr:max.yr)
if (change.at.election) {
    poll.df <- mutate(poll.df, YearFactor=factor(ElecYear, yr.range))
} else {
    poll.df <- mutate(poll.df, YearFactor=factor(Year, yr.range))
}
fm.vars <- c(fm.vars, "YearFactor")
dim(poll.df <- subset(poll.df, !is.na(YearFactor)))
gc()
## Drop variables with no valid responses
valid.vars <- names(poll.df)[sapply(poll.df, anyValid)]
(q.vars <- sort(intersect(q.vars, valid.vars)))
dim(poll.df <- subset(poll.df,, valid.vars))
## Drop rows with no valid question responses
dim(poll.df <- subset(poll.df, rowSums(!is.na(poll.df[q.vars])) > 0))
(q.vars <- sort(intersect(q.vars, names(poll.df))))
(fm.vars <- sort(intersect(fm.vars, names(poll.df))))
years.asked <- daply(data.frame(!is.na(poll.df)), ~poll.df$YearFactor, colSums)
(years.asked <- years.asked > 0)
(q.vars <- q.vars[colSums(years.asked[, q.vars], na.rm=TRUE) >= min.years])
dim(poll.df <- subset(poll.df,, c(fm.vars, q.vars)))

years.asked.melt <- melt(data.frame(years.asked[, q.vars],
                                    YearFactor=as.integer(rownames(years.asked))),
                         id="YearFactor")
poll.df$source.year <-
    interaction(poll.df$YearFactor, poll.df$source, drop=TRUE, lex.order=TRUE)
forms.asked <-
    daply(data.frame(!is.na(poll.df)), ~poll.df$source.year, colSums) > 0
forms.asked[, q.vars] <- 
    forms.asked[, q.vars] * apply(forms.asked[, q.vars], 1, sum)
q.vars <- q.vars[colSums(forms.asked[, q.vars], na.rm=TRUE) >= min.polls]

## Transform ordinal questions into binary above/below threshold indicators
for (q in seq_along(q.vars)) {
    LoopProgress(q, 10)
    levels.q <- levels(factor(poll.df[, q.vars[q]]))
    for (l in 1:(length(levels.q) - 1)) {
        poll.df[ncol(poll.df) + 1] <-
            as.integer(poll.df[, q.vars[q]] > levels.q[l])
        names(poll.df)[ncol(poll.df)] <-
            paste0(q.vars[q], ".gt", levels.q[l])
    }
}
(gt.vars <- sort(names(poll.df)[grep(".gt", names(poll.df))]))

### WEIGHTS
## Population targets
target.df <- read.dta("Targets1960to2010-140228.dta")
if (min.yr < min(target.df$Year)) {
    for (yr in (min(target.df$Year) - 1):min.yr) {
        new.df <- mutate(subset(target.df, Year==min(target.df$Year)),
                         Year=yr)
        target.df <- rbind(new.df, target.df)
    }
}
if (max.yr > max(target.df$Year)) {
    for (yr in (max(target.df$Year + 1)):max.yr) {
        new.df <- mutate(subset(target.df, Year==max(target.df$Year)),
                         Year=yr)
        target.df <- rbind(target.df, new.df)
    }
}
target.df <- mutate(target.df,
                    Female=factor(ifelse(Female, "Female", "Male"),
                        levels(poll.df$Female)),
                    Race2=ifelse(race3 == '2', 'Black', 'Non-Black'),
                    Race2=factor(Race2, levels(poll.df$Race2)),
                    Edu5=factor(as.integer(education2), labels=edu5.labels),
                    Edu5=ordered(Edu5, levels=edu5.labels),
                    Edu4=Recode(Edu5, 'c("College Graduate",
                          "Graduate School") = "College Graduate"'),
                    Edu4=ordered(Edu4, levels=edu4.labels))
target.ds <- svydesign(~1, weights=~Prop, data=target.df)

pre.wt.vars.ls <- list(NULL)
geo.var <- "StPOAbrv"
demo.vars <- NULL
geo.mod.vars <- NULL
geo.mod.prior.vars <- NULL
## Weight (PS or rake) w/in each group whose mean will be estimated
(covs <- c(geo.var, demo.vars))
covs.yr <- c(covs, "YearFactor")
(pre.wt.form.ls <- llply(pre.wt.vars.ls,
                         function (pwv) Formula(unique(c(pwv, covs, geo.var)))))
(ff <- Formula(c("0", covs)))
(ff.yr <- Formula(c("0", covs.yr)))

## Rows with no missing weighting variables or covariates and >0 valid responses
valid.gt <- !is.na(poll.df[, gt.vars])
table(rowSums(valid.gt))
(wt.covs.yr <- unique(c(unlist(pre.wt.vars.ls), covs.yr)))
use.rows <- (rowSums(!is.na(poll.df[, wt.covs.yr])) == length(wt.covs.yr) &
             rowSums(valid.gt) >= 1)
## Vars. used
gt.vars.use <- gt.vars[colSums(!is.na(poll.df[use.rows, gt.vars])) >= 1]
rm(list="valid.gt")
use.df <- data.frame(Obs=seq_len(sum(use.rows)), poll.df[use.rows, ])
gc()

training.fraction <- 1/4
nSims <- 10
s <- 1

for (s in s:nSims) {
    cat('\nSIMULATION', s, 'OF', nSims, '\n')
    
    ## sample observations
    set.seed(s)
    group.samples <- dlply(use.df, Formula(covs.yr), function (df) {
        n <- round(nrow(df)*training.fraction)
        sample(df$Obs, n)
    }, .progress = "text")
    training.obs <- sort(unlist(group.samples))
    validation.obs <- setdiff(use.df$Obs, training.obs)
    training.df <- use.df[training.obs, ]
    validation.df <- use.df[validation.obs, ]

################################################################################
#### PREPARE TRAINING DATA #####################################################
################################################################################
    
    MF <- model.frame(ff, data=training.df, na.action=return)
    MF.yr <- model.frame(ff.yr, data=training.df, na.action=return)
    ## Factor with level for each combination of covariate values
    group <- interaction(as.list(MF), lex.order=FALSE)
    group.yr <- interaction(as.list(MF.yr), lex.order=FALSE)
    ## Matrix of every factor combination (reordered to match group factor order)
    xtab.yr <- as.data.frame(table(MF.yr))
    xtab <- subset(xtab.yr, YearFactor == yr.range[1], -c(YearFactor, Freq))
    ## Dummy variable representation of contingency table
    XX <- model.matrix(ff, xtab)
    ## Response matrix
    YY <- as.matrix(training.df[, gt.vars.use])
    table(n.responses <- rowSums(!is.na(YY), na.rm=TRUE))

    ## design effects
    RakeWeight <- function (df, formula.list, target.design) {
        ds <- svydesign(~1, weights=~weight, data=df)
        pop.list <- llply(formula.list, svytable, design=target.design)
        rk <- RakePartial(ds, formula.list, pop.list)
        return (1/rk$prob)
    }
    de.df <- data.frame(row = seq_along(training.obs),
                        Nresponses = n.responses, 
                        training.df[, fm.vars])
    ## renormalize weights to be have mean of 1 within polls
    de.df <- group_by(de.df, source, YearFactor) %>%
        mutate(weight=weight/mean(weight))
    ## In each year, create weights that match specified targets
    de.df <- ddply(de.df, ~YearFactor, function (df) {
        ds <- subset(target.ds, target.ds$variables$Year==unique(df$YearFactor))
        df$new_weight <- (RakeWeight(df, pre.wt.form.ls, ds))
        df <- mutate(df, new_weight = new_weight/mean(new_weight, na.rm=TRUE))
        return(df)
    }, .progress = "text")
    ## calculate design effect within groups
    de.df <- mutate(PasteEval('group_by(de.df, ',
                              paste(covs.yr, collapse=', '), ')'),
                    de = 1 + (sd(new_weight)/mean(new_weight))^2,
                    de = ifelse(is.na(de), 1, de))
    (de.df <- plyr:::arrange(de.df, row))

    ## weighted sample size
    Nstar <- function (y, de, nr) {
        ob.adj <- as.numeric(!is.na(y)) / nr ## weight by inverse of num responses
        n.star <- ceiling(sum(ob.adj / de)) ## reduce N by design effect
        return(n.star)
    }
    ## weighted sum
    Sstar <- function (y, de, nr, wt) {
        y.bar.star <- weighted.mean(y, wt/nr, na.rm=TRUE)
        s.star <- round(Nstar(y, de, nr) * y.bar.star, digits=0)
        return(s.star)
    }
    xtab.yr$xtrow <- 1:nrow(xtab.yr)
    NN.dt <- aaply(YY, 2, function (q.var) {
        gdf <- grouped_df(data.frame(y = q.var, de.df), as.list(covs.yr),
                          drop = FALSE)
        sdf <- summarise(gdf, nstar = Nstar(y, de, Nresponses))
        mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
        mdf <- plyr:::arrange(mdf, xtrow)
        out <- mdf$nstar
        names(out) <- levels(group.yr)
        return(out)
    }, .progress = "text")
    gc()
    NN <- t(as.data.frame(NN.dt))
    gc()
    SS.dt <- aaply(YY, 2, function (q.var) {
        gdf <- grouped_df(data.frame(y = q.var, de.df), as.list(covs.yr),
                          drop = FALSE)
        sdf <- summarise(gdf, sstar = Sstar(y, de, Nresponses, new_weight))
        mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
        mdf <- plyr:::arrange(mdf, xtrow)
        out <- mdf$sstar
        names(out) <- levels(group.yr)
        return(out)
    }, .progress = "text")
    SS <- t(as.data.frame(SS.dt))
    gc()

    NN.cmb <- NN
    SS.cmb <- SS
    (qs.used <- gt.vars.use)
    train.props <- SS.cmb / NN.cmb
    ## Replace NA with 0
    NN.cmb[is.na(NN.cmb)] <- 0
    SS.cmb[is.na(SS.cmb)] <- 0
    stopifnot(all(SS.cmb - NN.cmb <= 0)) ## no more success than trials

    ## Years with surveys
    (svy.yrs <- yr.range[laply(yr.range, function (yr) {
        any(NN.cmb[grepl(yr, rownames(NN.cmb)), ] > 0)
    })])
    svy.yr.range <- min(svy.yrs):max(svy.yrs)

    T <- length(svy.yr.range) ## some years may not have a question
    Q <- length(qs.used)
    G <- nlevels(group)
    N <- sum(sum(NN.cmb > 0)) ## groups with data
    ## Vectors of counts and responses
    n_vec <- s_vec <- integer(N)
    names(n_vec) <- names(s_vec) <- seq_len(N)
    ## Missingness indicator array
    MMM <- array(1, dim=list(T, Q, G))
    pos <- 0
    for (t in 1:T) {
        print(yr <- svy.yr.range[t])
        if (!yr %in% svy.yrs) next
        yr.obs <- grep(yr, rownames(NN.cmb))
        for (q in 1:Q) {
            for (g in 1:G) {
                if (NN.cmb[yr.obs[g], q] == 0) next ## skip if no data
                pos <- pos + 1 
                MMM[t, q, g] <- 0 ## mark as not missing
                n_vec[pos] <- NN.cmb[yr.obs[g], q]
                s_vec[pos] <- SS.cmb[yr.obs[g], q]
                names(n_vec)[pos] <- names(s_vec)[pos] <-
                    paste(rownames(NN.cmb)[yr.obs[g]],
                          colnames(NN.cmb)[q], sep=" | ")
            }
        }
    }
    gc()
    quantile(n_vec, seq(.05, .95, .05))
    mean(!MMM) ## proportion of cells with data

    if (is.null(geo.mod.vars)) {
        zz.names <- list(svy.yr.range, levels(training.df[, geo.var]),
                         "Placeholder Zero")
        ZZ <- array(data=0, dim=llply(zz.names, length), dimnames=zz.names)
    } else {
        if (geo.var == "StPOAbrv") {
            ZZ <- melt(data.frame(st.dta), id.vars=c("StPOAbrv", "year"))
            ZZ <- acast(ZZ, year ~ StPOAbrv ~ variable)
            ZZ <- ZZ[as.character(svy.yr.range),, geo.mod.vars, drop=FALSE]
            dim(ZZ)
        }
    }
    if (is.null(geo.mod.prior.vars)) {
        zz.prior.names <- list(svy.yr.range, levels(training.df[, geo.var]),
                               "Placeholder Zero")
        ZZ.prior <- array(data=0, dim=llply(zz.prior.names, length),
                          dimnames=zz.prior.names)
    } else {
        if (geo.var == "StPOAbrv") {
            ZZ.prior <- melt(data.frame(st.dta), id.vars=c("StPOAbrv", "year"))
            ZZ.prior <- acast(ZZ.prior, year ~ StPOAbrv ~ variable)
            ZZ.prior <- ZZ.prior[as.character(svy.yr.range),,
                                 geo.mod.prior.vars, drop=FALSE]
            dim(ZZ.prior)
        }
    }

################################################################################
#### PREPARE VALIDATION DATA ###################################################
################################################################################

    MF.val <- model.frame(ff, data=validation.df, na.action=return)
    MF.val.yr <- model.frame(ff.yr, data=validation.df, na.action=return)
    ## Factor with level for each combination of covariate values
    group.val <- interaction(as.list(MF.val), lex.order=FALSE)
    group.val.yr <- interaction(as.list(MF.val.yr), lex.order=FALSE)
    ## Response matrix
    YY.val <- as.matrix(validation.df[, gt.vars.use])
    table(n.responses.val <- rowSums(!is.na(YY.val), na.rm=TRUE))
    de.val.df <- data.frame(row = seq_along(validation.obs),
                            Nresponses = n.responses.val, 
                            validation.df[, fm.vars])
    ## renormalize weights to be have mean of 1 within polls
    de.val.df <- group_by(de.val.df, source, YearFactor) %>%
        mutate(weight=weight/mean(weight))
    ## In each year, create weights that match specified targets
    de.val.df <- ddply(de.val.df, ~YearFactor, function (df) {
        ds <- subset(target.ds, target.ds$variables$Year==unique(df$YearFactor))
        df$new_weight <- RakeWeight(df, pre.wt.form.ls, ds)
        df <- mutate(df, new_weight = new_weight/mean(new_weight, na.rm=TRUE))
        return(df)
    }, .progress = "text")
    ## calculate design effect within groups
    de.val.df <- mutate(PasteEval('group_by(de.val.df, ',
                                  paste(covs.yr, collapse=', '), ')'),
                        de = 1 + (sd(new_weight)/mean(new_weight))^2,
                        de = ifelse(is.na(de), 1, de))
    (de.val.df <- plyr:::arrange(de.val.df, row))

    NN.val.dt <- aaply(YY.val, 2, function (q.var) {
        gdf <- grouped_df(data.frame(y = q.var, de.val.df), as.list(covs.yr),
                          drop = FALSE)
        sdf <- summarise(gdf, nstar = Nstar(y, de, Nresponses))
        mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
        mdf <- plyr:::arrange(mdf, xtrow)
        out <- mdf$nstar
        names(out) <- levels(group.val.yr)
        return(out)
    }, .progress = "text")
    gc()
    NN.val <- t(as.data.frame(NN.val.dt))
    gc()
    SS.val.dt <- aaply(YY.val, 2, function (q.var) {
        gdf <- grouped_df(data.frame(y = q.var, de.val.df), as.list(covs.yr),
                          drop = FALSE)
        sdf <- summarise(gdf, sstar = Sstar(y, de, Nresponses, new_weight))
        mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
        mdf <- plyr:::arrange(mdf, xtrow)
        out <- mdf$sstar
        names(out) <- levels(group.val.yr)
        return(out)
    }, .progress = "text")
    SS.val <- t(as.data.frame(SS.val.dt))
    gc()
    NN.val.cmb <- NN.val
    SS.val.cmb <- SS.val
    val.props <- SS.val.cmb / NN.val.cmb
    ## Replace NA with 0
    NN.val.cmb[is.na(NN.val.cmb)] <- 0
    SS.val.cmb[is.na(SS.val.cmb)] <- 0
    stopifnot(all(SS.val.cmb - NN.val.cmb <= 0)) ## no more success than trials

    stan.dyn.data <- list(n_vec = n_vec, ## response counts
                          s_vec = s_vec, ## group counts
                          XX = XX, ## indicator matrix of contingency table
                          ZZ = ZZ, ## design matrices for geographic model
                          ZZ_prior = ZZ.prior,
                          MMM = MMM, ## missingness array (T x Q x G)
                          G = G, ## number of covariate groups
                          Q = Q, ## number of questions (items)
                          T = T, ## number of time units (years)
                          N = N, ## number of observed group-question cells
                          P = ncol(XX), ## number of hierarchical parameters
                          S = dim(ZZ)[[2]], ## number of geographic units
                          H = dim(ZZ)[[3]], ## number of geographic-level pred.
                          H_prior = dim(ZZ.prior)[[3]],
                          separate_years=0, ## model pooled over time
                          constant_item=1, ## difficulties constant
                          D = 1
                          )

    save(stan.dyn.data, train.props, group, val.props,
         file = paste0('CV-StanData', s, '.RData'))
}
